# Load the packages we will need
library(tidyverse)
library(tidyr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(ggthemes)
library(hrbrthemes)
library(viridis)
library(extrafont)
#library(pathfindR)
library(BiocManager)
## Warning in file(con, "r"): URL 'https://bioconductor.org/config.yaml': status
## was 'Couldn't resolve host name'
## Warning in file(con, "r"): URL 'http://bioconductor.org/config.yaml': status was
## 'Couldn't resolve host name'
library(mixOmics)
library(reshape2)
library(showtext)
# font_add_google("Manrope")
# Import the initial data frame
# Import the initial data frame of capillary-electrophoresis profiles.
# NOTE(review): absolute, machine-specific path — consider a relative path or
# file.path() so the script runs on other machines.
profiles <- read.table(file = '/Users/ee/git/r/Project_periodontitis/Data/profiles.csv',
header = TRUE, sep = ',', quote = "", row.names = NULL,
stringsAsFactors = FALSE, check.names = FALSE)
head(profiles)
# Remove column "sample" from the data frame.
# A logical mask replaces -which(names(...) == "sample"): -which() silently
# drops EVERY column when "sample" is absent, because negative indexing with
# integer(0) selects nothing.
profiles_1 <- profiles[, names(profiles) != "sample", drop = FALSE]
head(profiles_1)
# Combine the measurement columns into a single (variable, value) pair of
# columns (long format)
profiles_melt <- melt(profiles_1)
head(profiles_melt)
# Reorder the groups based on the group (pathology) name
# Fix the display order of the pathology groups for summaries and plots
pathology_levels <- c("Healthy", "Gingivitis", "Mucositis", "Periimplantitis")
profiles_melt$group <- factor(profiles_melt$group, levels = pathology_levels)
# Calculate the mean and standard deviation of the fluorescence results
# (value column) per molecular weight of all samples in each group
# Mean and standard deviation of the fluorescence (value) per molecular
# weight, within each pathology group
summary_table <- profiles_melt %>%
group_by(group, variable) %>%
summarize(value.mean = mean(value),
value.sd = sd(value))
head(summary_table)
# Convert the 'variable' (molecular-weight) column to numeric and keep a
# rounded integer copy (the knitted text said 'value', but the code operates
# on 'variable')
profiles_melt$variable <- as.numeric(as.character(profiles_melt$variable))
profiles_melt$rounded_variable <- as.integer(round(profiles_melt$variable))
head(profiles_melt)
# Recalculate the mean and standard deviation per rounded molecular weight —
# this makes the plot easier to read and more visually appealing
# Mean and SD per rounded molecular weight in each group.
# The duplicate factor(profiles_melt$group, levels = ...) call that was here
# was removed: the levels were already set identically above, so re-applying
# them was a no-op.
summary_table <- profiles_melt %>%
  group_by(group, rounded_variable) %>%
  summarize(value.mean = mean(value),
            value.sd = sd(value))
head(summary_table)
# Plot the ribbon (mean ± SD) and the mean lines per group.
# BUG FIX: the global y aesthetic is mapped to value.mean — the original
# mapped y = rounded_variable, which geom_ribbon inherits and which wrongly
# trains the y scale up to the molecular-weight range.
p_plot_1 <- ggplot(summary_table, aes(x = rounded_variable,
                                      y = value.mean,
                                      fill = group)) +
  geom_ribbon(aes(ymin = value.mean - value.sd, ymax = value.mean + value.sd), alpha = 0.3) +
  geom_line(aes(color = group), linetype = "solid") +
  scale_fill_manual(values = c("green", "blue", "orange","red")) +
  scale_color_manual(values = c("green", "blue", "orange","red")) +
  coord_cartesian(xlim = c(10, 140)) +
  scale_x_continuous(breaks = seq(10, 140, 10)) +
  labs(x = "Molecular Weight", y = "Fluorescence", title = "Protein Capillary Electrophoresis") +
  theme(panel.grid = element_blank()) # no background grid
plot(p_plot_1)
# Figure 1: Profile plot - grouped line plot with mean (line) and standard
# deviation (shadow)
# "Normalizing data with negative values can be a bit tricky."
# Normalization option 1.1 - Total Ion Current (TIC): a common normalization
# factor for CE data is the total ion current (TIC), which represents the sum
# of all peak heights in a sample.
# Calculate the normalization factor: the TIC is the row-wise sum of the peak
# heights across all measurement columns (column 1 is the group label)
tic <- rowSums(profiles_1[,2:ncol(profiles_1)])
# Divide each variable by the TIC to obtain normalized values
data_norm1 <- profiles_1[,2:ncol(profiles_1)] / tic
# Re-attach the group/ID column to the normalized data
data_norm <- cbind(profiles_1[,1], data_norm1)
# Restore the original column names
colnames(data_norm) <- colnames(profiles_1)
head(data_norm)
# Summary statistics on the normalized data
# Reshape the TIC-normalized data to long format
profiles_melt_tic <- melt(data_norm)
# Reorder the groups based on name
profiles_melt_tic$group <- factor(profiles_melt_tic$group, levels = c("Healthy",
"Gingivitis",
"Mucositis",
"Periimplantitis"))
# Convert the 'variable' (molecular-weight) column to a numeric data type
profiles_melt_tic$variable <- as.numeric(as.character(profiles_melt_tic$variable))
# Round the molecular weight to integers
profiles_melt_tic$rounded_variable <- as.integer(round(profiles_melt_tic$variable))
# Mean and SD of the TIC-normalized intensity per rounded molecular weight
summary_table_tic <- profiles_melt_tic %>%
group_by(group, rounded_variable) %>%
summarize(value.mean = mean(value),
value.sd = sd(value))
head(summary_table_tic)
# Plotting
# Plot the TIC-normalized profiles: ribbon = mean ± SD, line = mean.
# BUG FIX: global y aesthetic mapped to value.mean — the original mapped
# y = rounded_variable, which geom_ribbon inherits and which wrongly
# stretches the y scale up to the molecular-weight range.
p_plot_tic <- ggplot(summary_table_tic, aes(x = rounded_variable,
                                            y = value.mean,
                                            fill = group)) +
  geom_ribbon(aes(ymin = value.mean - value.sd, ymax = value.mean + value.sd), alpha = 0.3) +
  geom_line(aes(color = group), linetype = "solid") +
  scale_fill_manual(values = c("green", "blue", "orange","red")) +
  scale_color_manual(values = c("green", "blue", "orange","red")) +
  coord_cartesian(xlim = c(10, 140)) +
  scale_x_continuous(breaks = seq(10, 140, 10)) +
  labs(x = "Molecular Weight", y = "Normalized Scale Intensity", title = "Protein Capillary Electrophoresis") +
  theme(panel.grid = element_blank()) # no background grid
plot(p_plot_tic)
# Figure 2: Profile plot with TIC-normalized data - grouped line plot with
# mean (line) and standard deviation (shadow)
# Log transformation on the Total Ion Current (TIC) normalized data
# log(x + 1) transform of the TIC-normalized intensities (the +1 keeps zero
# values finite)
data_log <- log(data_norm1 + 1)
# Re-attach the group column and restore the original column names
data_log <- cbind(profiles_1[,1], data_log)
colnames(data_log) <- colnames(profiles_1)
profiles_melt_tic1 <- melt(data_log)
## Using group as id variables
# Summary statistics on the normalized data
# Reorder the groups based on name
profiles_melt_tic1$group <- factor(profiles_melt_tic1$group, levels = c("Healthy",
"Gingivitis",
"Mucositis",
"Periimplantitis"))
# Convert the 'variable' (molecular-weight) column to a numeric data type
profiles_melt_tic1$variable <- as.numeric(as.character(profiles_melt_tic1$variable))
# Round the molecular weight to integers
profiles_melt_tic1$rounded_variable <- as.integer(round(profiles_melt_tic1$variable))
# Mean and SD of the TIC+log-normalized intensity per rounded molecular weight
summary_table_tic1 <- profiles_melt_tic1 %>%
group_by(group, rounded_variable) %>%
summarize(value.mean = mean(value),
value.sd = sd(value))
head(summary_table_tic1)
# Plotting
# Plot the TIC + log normalized profiles: ribbon = mean ± SD, line = mean.
# BUG FIX: global y aesthetic mapped to value.mean — the original mapped
# y = rounded_variable, which geom_ribbon inherits and which wrongly
# stretches the y scale up to the molecular-weight range.
p_plot_tic1 <- ggplot(summary_table_tic1, aes(x = rounded_variable,
                                              y = value.mean,
                                              fill = group)) +
  geom_ribbon(aes(ymin = value.mean - value.sd, ymax = value.mean + value.sd), alpha = 0.3) +
  geom_line(aes(color = group), linetype = "solid") +
  scale_fill_manual(values = c("green", "blue", "orange","red")) +
  scale_color_manual(values = c("green", "blue", "orange","red")) +
  coord_cartesian(xlim = c(10, 140)) +
  scale_x_continuous(breaks = seq(10, 140, 10)) +
  labs(x = "Molecular Weight", y = "Normalized Scale Intensity", title = "Protein Capillary Electrophoresis") +
  theme(panel.grid = element_blank()) # no background grid
plot(p_plot_tic1)
# Figure 3: Profile plot with TIC + log normalized data - grouped line plot
# with mean (line) and standard deviation (shadow)
# Another option is to use the scale() function to rescale the y-values to a
# specified range. The function below is intended to normalize each y-column
# to a range of 0 to 1:
# Normalize each measurement column of a CE data frame.
#
# data: data frame whose first column is an identifier (group) and whose
#       remaining columns are numeric intensity traces.
# Returns: a data frame with the identifier column followed by the scaled
#          columns, renamed y_<i>_norm.
#
# NOTE(review): despite the stated 0-1 goal, scale(y, center = FALSE,
# scale = max(abs(y))) maps values into [-1, 1] and the "+ 1" shifts that to
# [0, 2]; columns containing negative values will NOT land in [0, 1].
# scale() also returns a 1-column matrix (with attributes), not a plain
# vector — confirm downstream code tolerates this.
normalize_ce_data <- function(data) {
# Normalize each y-axis to range of 0 to 1
y_cols <- colnames(data)[-1]
y_norm <- lapply(data[y_cols], function(y) {
scale(y, center = FALSE, scale = max(abs(y), na.rm = TRUE)) + 1
})
# Merge normalized data by x-axis (re-attach the identifier column)
data_norm <- cbind(data[1], y_norm)
# Set placeholder column names for the scaled columns
colnames(data_norm)[-1] <- paste0("y_", seq_len(ncol(data_norm)-1), "_norm")
# Return normalized data
return(data_norm)
}
# Apply the function to normalize CE data
# Apply the function to normalize the CE data
ce_data_norm <- normalize_ce_data(profiles_1)
# Replace the placeholder column names with the original column names
names(ce_data_norm) <- names(profiles_1)
profiles_melt_norm <- melt(ce_data_norm)
# Summary statistics on the normalized data
# Reorder the groups based on name.
# BUG FIX: the factor levels must be applied to profiles_melt_norm's own group
# column — the original read profiles_melt$group, silently pairing this data
# frame with another object's column (it only worked because both melts share
# row order).
profiles_melt_norm$group <- factor(profiles_melt_norm$group, levels = c("Healthy",
                                                                        "Gingivitis",
                                                                        "Mucositis",
                                                                        "Periimplantitis"))
# Convert the 'variable' (molecular-weight) column to a numeric data type
profiles_melt_norm$variable <- as.numeric(as.character(profiles_melt_norm$variable))
# Round the molecular weight to integers
profiles_melt_norm$rounded_variable <- as.integer(round(profiles_melt_norm$variable))
# Mean and SD of the scale()-normalized intensity per rounded molecular weight
summary_table_norm <- profiles_melt_norm %>%
  group_by(group, rounded_variable) %>%
  summarize(value.mean = mean(value),
            value.sd = sd(value))
# Plotting
# Plot the scale()-normalized profiles: ribbon = mean ± SD, line = mean.
# BUG FIX: global y aesthetic mapped to value.mean — the original mapped
# y = rounded_variable, which geom_ribbon inherits and which wrongly
# stretches the y scale up to the molecular-weight range.
p_plot_norm <- ggplot(summary_table_norm, aes(x = rounded_variable,
                                              y = value.mean,
                                              fill = group)) +
  geom_ribbon(aes(ymin = value.mean - value.sd, ymax = value.mean + value.sd), alpha = 0.3) +
  geom_line(aes(color = group), linetype = "solid") +
  scale_fill_manual(values = c("green", "blue", "orange","red")) +
  scale_color_manual(values = c("green", "blue", "orange","red")) +
  coord_cartesian(xlim = c(10, 140)) +
  scale_x_continuous(breaks = seq(10, 140, 10)) +
  labs(x = "Molecular Weight", y = "Normalized Scale Intensity", title = "Protein Capillary Electrophoresis") +
  theme(panel.grid = element_blank()) # no background grid
plot(p_plot_norm)
# Figure 4: Profile plot with SCALE-normalized data - grouped line plot with
# mean (line) and standard deviation (shadow)
# SCALE normalization is definitely not the best strategy to proceed with the
# analysis. I suggest trying different analysis approaches: start from the
# original data and then move to the TIC-normalized data.
# Partial Least Squares Discriminant Analysis (PLS-DA) is a linear,
# multivariate model which uses the PLS algorithm to allow classification of
# categorically labelled data. PLS-DA seeks components that best separate the
# sample groups, whilst the sparse version also selects the variables that
# best discriminate between groups.
# The data - define predictor (X) and response (Y) variables
# Select the molecular-weight columns as the predictor matrix.
# Generalized from the hard-coded profiles_1[, 2:398] so the code keeps
# working if the number of MW columns changes (column 1 is the group label;
# with 397 MW columns this is identical to 2:398).
plsda <- profiles_1[, 2:ncol(profiles_1)]
colnames(plsda) <- paste("mw", colnames(plsda), sep="_") # prefix the column names with "mw_"
X <- plsda # predictors
profiles_1$group <- factor(profiles_1$group) # make the group column a factor
Y <- profiles_1$group # response (class labels)
# Check the dimensions of the X data frame
dim(X)
## [1] 49 397
# Check the distribution of class labels
summary(Y)
## Gingivitis Healthy Mucositis Periimplantitis
## 10 10 15 14
# As in most cases when developing models, exploring the data to determine the
# major sources of variation is a good first step; PCA is used for this.
# Centering and scaling are recommended to homogenize the variance across the
# samples. ncomp is set to an arbitrarily high number to understand the
# captured variance across components.
# Run the PCA method on the data and plot the eigenvalues (explained variance
# per component)
pca.srbct = pca(X, ncomp = 10, center = TRUE, scale = TRUE) # run pca method on data
plot(pca.srbct) # barplot of the eigenvalues (explained variance per component)
# FIGURE 5: barplot of the variance each principal component explains of the
# fluorescence-intensity-per-molecular-weight profile data.
# One component would be sufficient to explain a moderate proportion of the
# data's variance according to Figure 5; however, the first two components are
# used to keep the analysis flow. Next, the data is projected onto these two
# components to attempt to observe sources of variation.
# Plot the samples projected onto the PCA subspace
plotIndiv(pca.srbct, group = profiles_1$group, ind.names = FALSE, # plot the samples projected
legend = TRUE, title = 'PCA on SRBCT, comp 1 - 2') # onto the PCA subspace
# FIGURE 6: preliminary (unsupervised) analysis with PCA on the profile data
# It seems that the sample groups do not separate or cluster across the first
# two principal components, as seen in Figure 6. There are clusters, but they
# are not explained by the class variable. It can be inferred that the major
# source of variation is intensity differences between samples of the same
# group at the same molecular weight. Note that Figure 6 colours each sample
# by class; this is done only for visualisation, since PCA is an unsupervised
# approach.
# A PLS-DA model is fitted with ten components to evaluate the performance and
# the number of components necessary for the final model. A sample plot,
# including confidence ellipses, is shown in Figure 7(a); it shows much better
# clustering of samples by group than the PCA output.
# Fit an initial PLS-DA model with ten components
srbct.splsda <- splsda(X, Y, ncomp = 10)
# Plot the samples projected onto the first two components of the PLS-DA
# subspace
plotIndiv(srbct.splsda , comp = 1:2,
group = profiles_1$group, ind.names = FALSE, # colour points by class
ellipse = TRUE, # include 95% confidence ellipse for each class
legend = TRUE, title = '(a) PLSDA with confidence ellipses')
# FIGURE 7a: sample plot of the protein profile data after a basic PLS-DA
# model, depicting the samples with confidence ellipses per class label
# Use the max.dist measure to form decision boundaries between classes based
# on the PLS-DA data
background = background.predict(srbct.splsda, comp.predicted=2, dist = "max.dist")
# background.predict() can be used to depict the separation of class labels,
# as seen in Figure 7(b). This plot gives intuition on how novel samples would
# be classified according to the model generated by sPLS-DA.
# Plot the samples projected onto the first two components of the PLS-DA
# subspace, over the prediction background
plotIndiv(srbct.splsda, comp = 1:2,
group = profiles_1$group, ind.names = FALSE, # colour points by class
background = background, # include prediction background for each class
legend = TRUE, title = " (b) PLSDA with prediction background")
# FIGURE 7b: sample plot of the protein profile data after a basic PLS-DA
# model, depicting the prediction background generated by these samples. Both
# plots use the first two components as axes.
# The ncomp parameter:
# The number of components to use is a crucial decision, dictated by the
# performance of the PLS-DA model — i.e. its ability to correctly classify
# novel samples. The perf() function is used for exactly this, via repeated
# cross-validation; its output identifies the optimal number of components.
# A five-fold, 100-repeat cross-validation procedure is used here. Generally,
# for data sets with numerous samples, at least 10 folds are recommended; 3 or
# 5 folds suit smaller data sets, and data with very few samples should use
# leave-one-out (LOO) validation. Consider 50-100 repeats to reduce the impact
# of the randomly allocated folds during each repeat.
# The overall error rate (OER) and balanced error rate (BER) for the three
# different distance metrics (explained further below) across the first ten
# components are depicted in Figure 8.
# Undergo performance evaluation in order to tune the number of components
perf.splsda.srbct <- perf(srbct.splsda, validation = "Mfold",
folds = 5, nrepeat = 100, # use repeated cross-validation
progressBar = FALSE, auc = TRUE) # include AUC values
# Plot the outcome of performance evaluation across all ten components
plot(perf.splsda.srbct, col = color.mixo(5:7), sd = TRUE,
legend.position = "horizontal")
# FIGURE 8: tuning the number of components in PLS-DA. For each component,
# repeated cross-validation (100 x 5-fold CV) evaluates the classification
# performance (OER and BER) for each type of prediction distance: max.dist,
# centroids.dist and mahalanobis.dist.
# From this, three components seem appropriate, as the error for each distance
# metric decreases only incrementally afterwards; components beyond the third
# are likely to provide negligible gains in classification accuracy. A more
# empirical way to select this number is the $choice.ncomp component of the
# perf() output object, which runs t-tests for a significant difference in
# mean error rate across components. Using the max.dist metric, this suggests
# the optimal number of components is 1. When to use each distance metric is
# explained further below.
# What is the optimal number of components according to perf()?
perf.splsda.srbct$choice.ncomp
## max.dist centroids.dist mahalanobis.dist
## overall 1 1 2
## BER 1 1 2
# Grid of possible keepX values that will be tested for each component
list.keepX <- c(1:10, seq(20, 300, 10))
# Undergo the tuning process to determine the optimal number of variables
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 4, # calculate for first 4 components
validation = 'Mfold',
folds = 5, nrepeat = 10, # use repeated cross-validation
dist = 'max.dist', # use max.dist measure
measure = "BER", # use balanced error rate of dist measure
test.keepX = list.keepX,
cpus = 2) # allow parallelisation to decrease runtime
# Plot the output of the variable-number tuning
plot(tune.splsda.srbct, col = color.jet(4))
# What is the optimal number of components according to tune.splsda()?
tune.splsda.srbct$choice.ncomp$ncomp
## [1] 1
# What are the optimal numbers of variables according to tune.splsda()?
tune.splsda.srbct$choice.keepX
## comp1 comp2 comp3 comp4
## 10 300 30 110
# Store the tuning results into variables
optimal.ncomp <- tune.splsda.srbct$choice.ncomp$ncomp
optimal.keepX <- tune.splsda.srbct$choice.keepX[1:optimal.ncomp]
# Form the final model with the optimized component and variable counts.
# Because the optimal number of components defined previously is 1 (which
# would make the final model hard to visualise/reproduce), ncomp = 3 is used
# as a standard to proceed.
# NOTE(review): optimal.keepX then has length 1 while ncomp = 3 — confirm that
# splsda()'s behaviour for the unspecified components is the intended one.
final.splsda <- splsda(X, Y,
ncomp = 3, # the "optimal.ncomp" identified only 1 component as ideal
keepX = optimal.keepX)
# Plot samples from the final model
plotIndiv(final.splsda, comp = c(1,2), # plot samples from final model
group = profiles_1$group, ind.names = FALSE, # colour by class label
ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
title = ' (a) sPLS-DA on SRBCT, comp 1 & 2')
# plotIndiv(final.splsda, comp = c(1,3), # plot samples from final model
# group = profiles_1$group, ind.names = FALSE, # colour by class label
# ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
# title = '(b) sPLS-DA on SRBCT, comp 1 & 3')
# Set the styling of the legend to be homogeneous with previous plots
# Legend styling, kept consistent with the previous plots.
# (Top-level assignment changed from "=" to the idiomatic "<-".)
legend <- list(legend = levels(Y), # set of classes
               col = unique(color.mixo(Y)), # set of colours
               title = "Profile grouping", # legend title
               cex = 0.7) # legend size
# Generate the CIM, using the legend and colouring rows by each sample's class.
# NOTE(review): this errored when knitted ("figure margins too large") — see
# ?cim (e.g. enlarge the graphics device or adjust margins) before relying on
# this output.
cim <- cim(final.splsda, row.sideColors = color.mixo(Y),
           legend = legend)
# ggsave("cim.png", cim, width = 10, height = 5, dpi = 300)
# Form a new perf() object which utilises the final model
# Re-run performance evaluation on the final model
perf.splsda.srbct <- perf(final.splsda,
folds = 5, nrepeat = 10, # use repeated cross-validation
validation = "Mfold", dist = "max.dist", # use max.dist measure
progressBar = FALSE)
# Plot the stability of each feature for the first three components; type 'h'
# draws histogram-like vertical lines
par(mfrow=c(1,3))
plot(perf.splsda.srbct$features$stable[[1]], type = 'h',
ylab = 'Stability',
xlab = 'Features',
main = '(a) Comp 1', las =2)
plot(perf.splsda.srbct$features$stable[[2]], type = 'h',
ylab = 'Stability',
xlab = 'Features',
main = '(b) Comp 2', las =2)
plot(perf.splsda.srbct$features$stable[[3]], type = 'h',
ylab = 'Stability',
xlab = 'Features',
main = '(c) Comp 3', las =2)
# var.name.short <- substr(profiles_1[,2:398], 1, 10) # form simplified gene names
#
# plotVar(final.splsda, comp = c(1,2), var.names = list(var.name.short), cex = 3) # generate correlation circle plot
# Dataset preparation
# Randomly select 5 samples for training.
# NOTE(review): the knitted comment said "50 samples", but the code draws 5 —
# with 49 samples, 50 is impossible; confirm the intended training-set size.
# Also no set.seed() is used, so the split is not reproducible.
train <- sample(1:nrow(X), 5)
test <- setdiff(1:nrow(X), train) # the rest forms the test set
# Store matrices into training and test sets:
X.train <- X[train, ]
X.test <- X[test,]
Y.train <- Y[train]
Y.test <- Y[test]
# Train the model
# Train an sPLS-DA model on the training split only
train.splsda.srbct <- splsda(X.train, Y.train, ncomp = optimal.ncomp, keepX = optimal.keepX)
# Use the model to predict classes for the X.test set
predict.splsda.srbct <- predict(train.splsda.srbct, X.test,
dist = "mahalanobis.dist")
# Evaluate the prediction accuracy.
# NOTE(review): [, 1] takes the predictions based on component 1 only — the
# knitted text said "first two components"; confirm which was intended.
predict.comp2 <- predict.splsda.srbct$class$mahalanobis.dist[,1]
table(factor(predict.comp2, levels = levels(Y)), Y.test)
## Y.test
## Gingivitis Healthy Mucositis Periimplantitis
## Gingivitis 1 2 1 0
## Healthy 1 1 3 5
## Mucositis 1 4 4 1
## Periimplantitis 6 2 5 7
# AUROC for the first component
auc.splsda = auroc(final.splsda, roc.comp = 1, print = FALSE)
# AUROC using all three components
auc.splsda = auroc(final.splsda, roc.comp = 3, print = FALSE)